Most Common Mutations in Each Cancer Type (and Pan Cancer)

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#> 
#>     between, first, last
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ forcats   1.0.0     ✔ readr     2.1.5
#> ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
#> ✔ lubridate 1.9.3     ✔ tibble    3.2.1
#> ✔ purrr     1.0.2     ✔ tidyr     1.3.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::between()     masks data.table::between()
#> ✖ dplyr::filter()      masks stats::filter()
#> ✖ dplyr::first()       masks data.table::first()
#> ✖ lubridate::hour()    masks data.table::hour()
#> ✖ lubridate::isoweek() masks data.table::isoweek()
#> ✖ dplyr::lag()         masks stats::lag()
#> ✖ dplyr::last()        masks data.table::last()
#> ✖ lubridate::mday()    masks data.table::mday()
#> ✖ lubridate::minute()  masks data.table::minute()
#> ✖ lubridate::month()   masks data.table::month()
#> ✖ lubridate::quarter() masks data.table::quarter()
#> ✖ lubridate::second()  masks data.table::second()
#> ✖ purrr::transpose()   masks data.table::transpose()
#> ✖ lubridate::wday()    masks data.table::wday()
#> ✖ lubridate::week()    masks data.table::week()
#> ✖ lubridate::yday()    masks data.table::yday()
#> ✖ lubridate::year()    masks data.table::year()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Catalogue of the available TCGA studies; the code below reads its
# Study_Abbreviation and Study_Name columns.
# NOTE(review): no library() call visible in this file attaches the package
# providing tcga_available()/tcgaLoad() -- confirm it is loaded beforehand.
cancer_data <- as.data.frame(tcga_available())

# Number of top-ranked mutations kept per study for the single-study plots
# and for the (more crowded) facet plots.
n_top_single <- 50
n_top_facet <- 20

# Rank the mutations of one study by the fraction of its tumours carrying them.
#
# mutation_data: data frame with Hugo_Symbol, HGVSc and Tumor_Sample_Barcode
#                columns, one row per called mutation.
# cancer_name:   label written to the `cancer_name_internal` column.
# Returns a tibble with columns mutation ("GENE, HGVSc"), freq (fraction of
# the study's distinct tumour barcodes) and cancer_name_internal, sorted by
# decreasing freq.
#
# Previously captured `#>` output in this file predates the gene + HGVSc
# grouping below and has to be regenerated.
rank_study_mutations <- function(mutation_data, cancer_name) {
  num_tumors <- n_distinct(mutation_data$Tumor_Sample_Barcode)

  # nolint start: object_usage_linter.
  mutation_data %>%
    # A row without an HGVSc string cannot be labelled as a nucleotide change.
    filter(!is.na(HGVSc), HGVSc != "") %>%
    # Count every tumour at most once per mutation.
    distinct(Hugo_Symbol, HGVSc, Tumor_Sample_Barcode) %>%
    # Key on gene AND HGVSc: the same HGVSc string (e.g. "c.3G>A") occurs in
    # many unrelated genes, so grouping on HGVSc alone pools them and credits
    # the total to whichever gene happens to come first.
    count(Hugo_Symbol, HGVSc, name = "freq", sort = TRUE) %>%
    as_tibble() %>%
    mutate(
      mutation = paste(Hugo_Symbol, HGVSc, sep = ", "),
      freq = freq / num_tumors,
      cancer_name_internal = cancer_name
    ) %>%
    select(mutation, freq, cancer_name_internal)
  # nolint end
}

# Keep at most `n` top rows of a ranked table and store `mutation` as a factor
# whose levels follow the ranking, so plots keep that order. head() is used
# instead of `[1:n, ]` because the latter pads short tables with NA rows.
top_ranked <- function(ranked, n) {
  top <- head(ranked, n)
  top$mutation <- factor(top$mutation, levels = top$mutation)
  top
}

cancer_names <- gsub("_", " ", cancer_data$Study_Name)
n_studies <- nrow(cancer_data)

# Preallocate the list columns instead of growing tibbles row by row.
single_data <- vector("list", n_studies)
facet_data <- vector("list", n_studies)

for (i in seq_len(n_studies)) {
  # Load TCGA mutation data
  mutations <- tcgaLoad(study = cancer_data$Study_Abbreviation[i])
  ranked <- rank_study_mutations(
    as.data.frame(mutations@data),
    cancer_names[i]
  )

  single_data[[i]] <- top_ranked(ranked, n_top_single)
  facet_data[[i]] <- top_ranked(ranked, n_top_facet)
}

# One row per study: display name plus a list column holding its top table.
cancer_dfs <- tibble(cancer_name = cancer_names, data = single_data)
cancer_dfs_facet <- tibble(cancer_name = cancer_names, data = facet_data)
#> Loading ACC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BLCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BRCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CESC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CHOL. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading COAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading DLBC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading ESCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading GBM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading HNSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KICH. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRP. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LAML. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LGG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LIHC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading MESO. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading OV. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PAAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PCPG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PRAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading READ. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SARC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SKCM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading STAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading TGCT. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THYM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCEC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCS. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UVM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
library(ggplot2)
library(purrr)
library(gridExtra)
#> 
#> Attaching package: 'gridExtra'
#> The following object is masked from 'package:dplyr':
#> 
#>     combine

# Build a horizontal bar chart of mutation frequencies for one cancer type.
#
# df:   data frame with a `mutation` column (one bar each) and a numeric
#       `freq` column (bar length, also printed rounded to 3 decimals).
# name: cancer type name appended to the plot title.
# Returns the ggplot object.
create_plot <- function(df, name) {
  plot_title <- paste0("Frequency of Mutations in ", name)
  value_labels <- geom_text(
    aes(label = round(freq, 3)),
    size = 3.5,
    hjust = 1.2,
    color = "white"
  )

  ggplot(data = df, mapping = aes(x = mutation, y = freq)) + # nolint: object_usage_linter.
    geom_bar(stat = "identity", fill = "blue") +
    value_labels +
    coord_flip() +
    labs(x = "Mutation", y = "Frequency", title = plot_title) +
    theme_minimal()
}

# One bar chart per row of `cancer_dfs`: the row's table is plotted and its
# cancer name goes into the title.
plots <- map2(cancer_dfs$data, cancer_dfs$cancer_name, create_plot)

plots
#> [[1]]

#> 
#> [[2]]

#> 
#> [[3]]

#> 
#> [[4]]

#> 
#> [[5]]

#> 
#> [[6]]

#> 
#> [[7]]

#> 
#> [[8]]

#> 
#> [[9]]

#> 
#> [[10]]

#> 
#> [[11]]

#> 
#> [[12]]

#> 
#> [[13]]

#> 
#> [[14]]

#> 
#> [[15]]

#> 
#> [[16]]

#> 
#> [[17]]

#> 
#> [[18]]

#> 
#> [[19]]

#> 
#> [[20]]

#> 
#> [[21]]

#> 
#> [[22]]

#> 
#> [[23]]

#> 
#> [[24]]

#> 
#> [[25]]

#> 
#> [[26]]

#> 
#> [[27]]

#> 
#> [[28]]

#> 
#> [[29]]

#> 
#> [[30]]

#> 
#> [[31]]

#> 
#> [[32]]

#> 
#> [[33]]

# Spot-check the facet table of the third study; single-bracket indexing keeps
# it wrapped in a one-element list.
print(cancer_dfs_facet[["data"]][3])
#> [[1]]
#> # A tibble: 20 × 3
#>    mutation             freq cancer_name_internal     
#>    <fct>               <dbl> <chr>                    
#>  1 PIK3CA, c.3140A>G  0.128  Breast invasive carcinoma
#>  2 PIK3CA, c.1633G>A  0.0673 Breast invasive carcinoma
#>  3 PIK3CA, c.1624G>A  0.0526 Breast invasive carcinoma
#>  4 OR6C4, c.3G>A      0.0409 Breast invasive carcinoma
#>  5 HIF3A, c.49G>A     0.0390 Breast invasive carcinoma
#>  6 NLN, c.304G>A      0.0292 Breast invasive carcinoma
#>  7 TP53, c.524G>A     0.0292 Breast invasive carcinoma
#>  8 CUL3, c.274G>A     0.0273 Breast invasive carcinoma
#>  9 FHL1, c.550G>A     0.0273 Breast invasive carcinoma
#> 10 DCTPP1, c.232G>A   0.0263 Breast invasive carcinoma
#> 11 HMGB2, c.97G>A     0.0263 Breast invasive carcinoma
#> 12 CHRNB4, c.664G>A   0.0263 Breast invasive carcinoma
#> 13 FZD2, c.364G>A     0.0253 Breast invasive carcinoma
#> 14 ZSCAN1, c.85G>A    0.0253 Breast invasive carcinoma
#> 15 PRKCE, c.256G>A    0.0253 Breast invasive carcinoma
#> 16 TRIM65, c.466G>A   0.0244 Breast invasive carcinoma
#> 17 BEST3, c.220G>A    0.0244 Breast invasive carcinoma
#> 18 RNASE12, c.211G>A  0.0244 Breast invasive carcinoma
#> 19 ZBED4, c.244G>A    0.0244 Breast invasive carcinoma
#> 20 TRAV38-1, c.301G>A 0.0244 Breast invasive carcinoma
# print(cancer_dfs_facet$cancer_name)
library(tidyverse)

# All cancer types are too many for one facet plot, so split them into
# consecutive groups of at most `plots_per_group` types (one figure per group).
cancer_types <- unique(cancer_dfs_facet$cancer_name)
plots_per_group <- 4
n_groups <- ceiling(length(cancer_types) / plots_per_group)
# The i-th cancer type belongs to group ceiling(i / plots_per_group).
group_assignments <- as.integer(
  ceiling(seq_along(cancer_types) / plots_per_group)
)
cancer_type_groups <- split(cancer_types, group_assignments)

# Draw one faceted bar chart covering a subset of cancer types.
#
# df_data:             table with a `cancer_name` column and a `data` list
#                      column of per-cancer tables; each of those needs
#                      `mutation`, `freq` and `cancer_name_internal` columns.
# cancer_types_subset: values of `cancer_name` to include in this figure.
# Returns a ggplot object with one panel per `cancer_name_internal` value.
create_facet_plot <- function(df_data, cancer_types_subset) {
  # Pick the per-cancer tables of the requested types and stack them.
  is_selected <- df_data$cancer_name %in% cancer_types_subset
  combined_data <- bind_rows(df_data$data[is_selected])

  value_labels <- geom_text(
    aes(label = round(freq, 3)),
    size = 3.5,
    hjust = 1.2,
    color = "white"
  )

  # Create plot
  ggplot(combined_data, aes(x = mutation, y = freq)) + # nolint: object_usage_linter.
    geom_bar(stat = "identity", fill = "blue") +
    value_labels +
    coord_flip() +
    labs(x = "Mutation", y = "Frequency") +
    facet_wrap(~cancer_name_internal, scales = "free") +
    theme_minimal()
}

# Build one faceted figure per group of cancer types; each group is passed to
# create_facet_plot() as `cancer_types_subset`.
facet_plots <- lapply(
  cancer_type_groups,
  create_facet_plot,
  df_data = cancer_dfs_facet
)


# Draw the figures one after another.
for (facet_plot in facet_plots) {
  print(facet_plot)
}

# Study abbreviations of all cancer types. This overwrites the earlier
# `cancer_types`, which was filled from `cancer_dfs_facet$cancer_name`.
cancer_types <- unique(cancer_data[["Study_Abbreviation"]])

# Load the mutations of a single study and tag every row with the study, so
# that the per-study tables can be told apart once combined. Studies are
# loaded individually because TCGAmutations does not support loading all.
#
# cancer_type: study abbreviation handed to tcgaLoad(study = ...).
# Returns the `@data` slot of the loaded object as a data.frame with an
# additional `cancer_type` column.
load_mutations <- function(cancer_type) {
  loaded_study <- tcgaLoad(study = cancer_type) # nolint: object_usage_linter.
  study_table <- as.data.frame(loaded_study@data)
  study_table$cancer_type <- cancer_type
  study_table
}

# One mutation table per study, in the order of `cancer_types`.
pan_cancer_list <- lapply(X = cancer_types, FUN = load_mutations)
#> Loading ACC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BLCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading BRCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CESC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading CHOL. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading COAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading DLBC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading ESCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading GBM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading HNSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KICH. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading KIRP. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LAML. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LGG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LIHC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading LUSC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading MESO. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading OV. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PAAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PCPG. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading PRAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading READ. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SARC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading SKCM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading STAD. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading TGCT. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THCA. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading THYM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCEC. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UCS. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference
#> Loading UVM. Please cite: https://doi.org/10.1016/j.cels.2018.03.002 for reference

# Stack the per-study tables into one pan-cancer table. The "New names"
# messages arise because the tables carry two columns both named IMPACT,
# which bind_rows() renames with position suffixes.
pan_cancer_df <- pan_cancer_list %>% bind_rows()
#> New names:
#> • `IMPACT` -> `IMPACT...19`
#> • `IMPACT` -> `IMPACT...21`
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> New names:
#> • `IMPACT` -> `IMPACT...19`
#> • `IMPACT` -> `IMPACT...21`
# Preview the first rows of the combined table.
head(pan_cancer_df, n = 6L)
#>   Hugo_Symbol Chromosome Start_Position End_Position Variant_Classification
#> 1        OPN4         10       88419681     88419681      Missense_Mutation
#> 2       KLRB1         12        9760409      9760409      Missense_Mutation
#> 3       SALL2         14       21991730     21991730      Missense_Mutation
#> 4    C15orf27         15       76467904     76467904        Frame_Shift_Del
#> 5      KLHDC4         16       87742934     87742934      Missense_Mutation
#> 6       NOL11         17       65734087     65734087      Missense_Mutation
#>   Variant_Type Reference_Allele Tumor_Seq_Allele2         Tumor_Sample_Barcode
#> 1          SNP                G                 A TCGA-OR-A5J1-01A-11D-A29I-10
#> 2          SNP                C                 G TCGA-OR-A5J1-01A-11D-A29I-10
#> 3          SNP                C                 T TCGA-OR-A5J1-01A-11D-A29I-10
#> 4          DEL                C                 - TCGA-OR-A5J1-01A-11D-A29I-10
#> 5          SNP                C                 T TCGA-OR-A5J1-01A-11D-A29I-10
#> 6          SNP                A                 C TCGA-OR-A5J1-01A-11D-A29I-10
#>    Matched_Norm_Sample_Barcode     HGVSc HGVSp_Short   Transcript_ID
#> 1 TCGA-OR-A5J1-10A-01D-A29L-10  c.863G>A     p.G288D ENST00000372071
#> 2 TCGA-OR-A5J1-10A-01D-A29L-10   c.27G>C       p.E9D ENST00000229402
#> 3 TCGA-OR-A5J1-10A-01D-A29L-10 c.2132G>A     p.R711Q ENST00000327430
#> 4 TCGA-OR-A5J1-10A-01D-A29L-10 c.657delC     p.Y219* ENST00000388942
#> 5 TCGA-OR-A5J1-10A-01D-A29L-10 c.1384G>A     p.D462N ENST00000270583
#> 6 TCGA-OR-A5J1-10A-01D-A29L-10 c.1528A>C     p.S510R ENST00000253247
#>   Exon_Number t_ref_count t_alt_count n_ref_count n_alt_count IMPACT...19
#> 1        7/11         133          10         140           0    MODERATE
#> 2         1/6         166         113         186           0    MODERATE
#> 3         2/2          54          47          78           1    MODERATE
#> 4        8/11         121          70         113           0        HIGH
#> 5       10/12         122          45         112           0    MODERATE
#> 6       13/18          29          24          50           0    MODERATE
#>   ExAC_AF IMPACT...21 FILTER Tumor_Sample_Barcode_min cancer_type
#> 1       .    MODERATE   PASS             TCGA-OR-A5J1         ACC
#> 2       .    MODERATE   PASS             TCGA-OR-A5J1         ACC
#> 3       .    MODERATE   PASS             TCGA-OR-A5J1         ACC
#> 4       .        HIGH   PASS             TCGA-OR-A5J1         ACC
#> 5       .    MODERATE   PASS             TCGA-OR-A5J1         ACC
#> 6       .    MODERATE   PASS             TCGA-OR-A5J1         ACC
# Pan-cancer ranking of nucleotide changes by the fraction of all tumours
# carrying them.
# Previously captured `#>` output of `df_merged_trunc` in this file predates
# the gene + HGVSc grouping below and has to be regenerated.
nucleotide_changes <- pan_cancer_df[
  ,
  c("Hugo_Symbol", "HGVSc", "cancer_type", "Tumor_Sample_Barcode")
]

tumor_samples <- pan_cancer_df[["Tumor_Sample_Barcode"]]

num_tumors <- length(unique(tumor_samples))
print(length(tumor_samples))
#> [1] 2147998
print(num_tumors)
#> [1] 10201

# Number of distinct tumours per (gene, HGVSc, cancer type).
# Gene AND HGVSc form the key: the same HGVSc string (e.g. "c.3G>A") occurs in
# many unrelated genes, so grouping on HGVSc alone pools them under whichever
# gene comes first. Rows without an HGVSc string cannot be labelled and are
# dropped.
df <- nucleotide_changes %>%
  filter(!is.na(HGVSc), HGVSc != "") %>%
  distinct(Hugo_Symbol, HGVSc, cancer_type, Tumor_Sample_Barcode) %>%
  count(Hugo_Symbol, HGVSc, cancer_type, name = "n_type") %>%
  as_tibble()

# Collapse to one row per mutation, most frequent first. `freq` is the tumour
# count over all cancer types; `cancer_type` is the type contributing the most
# tumours (rows are pre-sorted by n_type, so it is the first one per group;
# ties are resolved by row order).
df_sorted <- df %>%
  arrange(desc(n_type)) %>%
  group_by(Hugo_Symbol, HGVSc) %>%
  summarise(
    freq = sum(n_type),
    cancer_type = cancer_type[1],
    .groups = "drop"
  ) %>%
  arrange(desc(freq))

# Merge gene name and HGVSc into one column
df_merged <- df_sorted %>%
  mutate(mutation = paste(Hugo_Symbol, HGVSc, sep = ", ")) %>%
  select(mutation, freq, cancer_type)

# head() never pads with NA rows, unlike `[1:50, ]` on a shorter table.
df_merged_trunc <- head(df_merged, 50)

# Convert the mutation column to a factor whose levels follow the ranking, so
# the plot keeps that order; only the rows kept are used as levels.
df_merged_trunc$mutation <- factor(
  df_merged_trunc$mutation,
  levels = df_merged_trunc$mutation
)
# Turn tumour counts into fractions of all distinct tumour barcodes.
df_merged_trunc$freq <- df_merged_trunc$freq / num_tumors

df_merged_trunc
#> # A tibble: 50 × 3
#>    mutation           freq cancer_type
#>    <fct>             <dbl> <chr>      
#>  1 RBMXL2, c.3G>A   0.0863 BLCA       
#>  2 COL5A2, c.395G>A 0.0717 ACC        
#>  3 OR4K15, c.3G>T   0.0600 ACC        
#>  4 BRAF, c.1799T>A  0.0560 BLCA       
#>  5 ALG10, c.4G>A    0.0548 ACC        
#>  6 MROH5, c.145G>A  0.0539 ACC        
#>  7 GRIK5, c.331G>A  0.0528 BLCA       
#>  8 ZNF598, c.397G>A 0.0523 ACC        
#>  9 PTH, c.226G>A    0.0516 BLCA       
#> 10 OGG1, c.346G>A   0.0513 ACC        
#> # ℹ 40 more rows
# Horizontal bar chart of the pan-cancer table: one bar per mutation, with the
# frequency rounded to 3 decimals printed in white on the bar.
pancan_plot <- ggplot(
  data = df_merged_trunc,
  mapping = aes(x = mutation, y = freq)
) +
  geom_bar(stat = "identity", fill = "blue") +
  geom_text(
    mapping = aes(label = round(freq, 3)),
    size = 3.5,
    hjust = 1.2,
    color = "white"
  ) +
  coord_flip() +
  labs(
    x = "Mutation",
    y = "Frequency",
    title = "Pan-cancer Frequency of Mutations"
  ) +
  theme_minimal()

pancan_plot